pacman::p_load(tidyverse, data.table, broom, readxl)
rm(list=ls())
`%!in%` = Negate(`%in%`)
################################################################################

###### Spatial units

spatial_id_a = 'county_id'
spatial_id_b = 'amc_id'

###### Baseline variables

for (country in c('argentina','brazil') ){
  
  df_l = fread(paste0('../../data/landuse/clean/landuse_',country,'_county.csv.gz'))
  df_u = fread(paste0('../../data/landuse/clean/geographicunits_',country,'.csv.gz'))
  df_yc = fread(paste0('../../data/agronomy/clean/faogaez_',country,'.csv.gz'))
  df_yp = fread('../../data/agronomy/clean/pastureproductivity_argentinabrazil.csv.gz')[unit=='head per hectare',]
  setnames(df_yp, old = c('int_mean'),  new = c('yield'))
  setnames(df_yc, old = c('int_mean'),  new = c('yield'))

  if (country=='argentina'){
    
    #Spatial unit crosswalk
    df_u = df_u[, list(county_id, state_id, region)]
    
    #Land
    df = merge(df_l, df_u, by='county_id')
    df = df[, c('pasture') := list(pasture+forage_seasonal+forage_permanent)]
    setnames(df, old = c(spatial_id_a),  new = c('spatial_id'))
    df_land_a = df[,  list(year, region, state_id, spatial_id,pasture,soybean, maize, wheat, sunflower)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, region, state_id, spatial_id)][, unit:='hectares']
    
    #Yields
    yield_list_a = c('soybean','maize','wheat','sunflower')
    df_yp = df_yp[country=='ARGENTINA', list(county_id, crop, yield, unit)][, lapply(.SD, mean, na.rm=TRUE), by=list(county_id, crop, unit)]
    df_yc = df_yc[crop %in% yield_list_a, list(county_id, crop, yield)][, lapply(.SD, mean, na.rm=TRUE), by=list(county_id, crop)][, unit:='tonne per hectare']
    df = merge(na.omit(rbind(df_yp,df_yc), cols="yield"), df_u, by='county_id')
    setnames(df, old = c(spatial_id_a, 'crop'),  new = c('spatial_id','commodity'))
    df_yield_a = df[, list(region, state_id, spatial_id, commodity, yield, unit)][, lapply(.SD, mean, na.rm=TRUE), by=list(region, state_id, spatial_id, commodity, unit)]
    
    #Farm-gate commodity prices
    df = fread(paste0('../../data/prices/clean/farmgate_crops_argentina_',spatial_id_a,'.csv.gz'))
    df=df[year!=2018,]
    df[year==2017,]$year=2018
    df_pc = df[commodity %in% yield_list_a, list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per tonne']
    df = fread(paste0('../../data/prices/clean/farmgate_cattle_argentina_',spatial_id_a,'.csv.gz'))[, commodity:='pasture']
    setnames(df, old = c('value'),  new = c('price'))
    df_pp = df[establishment_type %in% c('all establishments') & value_type %in% c('price_per_head'), list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per head']
    df_prices_a = rbind(df_pc, df_pp)
    
    #Payoffs per ha
    df_x = gather(df_land_a, landuse, acreage, pasture:sunflower, factor_key=TRUE)
    df_y = merge(df_prices_a, df_yield_a, by=c('region', 'state_id', 'spatial_id', 'commodity'))
    setnames(df_y, old = c('commodity'),  new = c('landuse'))
    
    df_ols_a = merge(df_y, df_x, by=c('year','region', 'state_id','spatial_id', 'landuse'))[,list(year, region, state_id, spatial_id, landuse, acreage, price, yield)]
    df_ols_a$country='ARGENTINA'    
    
  } else {
    
    #Spatial unit crosswalk
    df_u = df_u[, list(county_id, amc_id, microregion_id, mesoregion_id, state_id, region)]
    
    #Land
    df = merge(df_l, df_u, by='county_id')
    df = df[, c('pasture') := list(pasture_natural+pasture_planted)]
    setnames(df, old = c(spatial_id_b),  new = c('spatial_id'))
    df_land_b = df[,  list(year, region, state_id, spatial_id,pasture,soybean, maize, wheat, rice, sugarcane)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, region, state_id, spatial_id)][, unit:='hectares']
    
    #Yields
    yield_list_b = c('soybean','maize','wheat','rice','sugarcane')
    df_yp = df_yp[country=='BRAZIL', list(county_id, crop, yield, unit)][, lapply(.SD, mean, na.rm=TRUE), by=list(county_id, crop, unit)]
    df_yc = df_yc[crop %in% yield_list_b, list(county_id, crop, yield)][, lapply(.SD, mean, na.rm=TRUE), by=list(county_id, crop)][, unit:='tonne per hectare']
    df = merge(na.omit(rbind(df_yp,df_yc), cols="yield"), df_u, by='county_id')
    setnames(df, old = c(spatial_id_b, 'crop'),  new = c('spatial_id','commodity'))
    df_yield_b = df[, list(region, state_id, spatial_id, commodity, yield, unit)][, lapply(.SD, mean, na.rm=TRUE), by=list(region, state_id, spatial_id, commodity, unit)]
    
    #Farm-gate commodity prices
    df = fread(paste0('../../data/prices/clean/farmgate_crops_brazil_',spatial_id_b,'.csv.gz'))
    df_pc = df[commodity %in% yield_list_b, list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per tonne']
    df = fread(paste0('../../data/prices/clean/farmgate_cattle_brazil_',spatial_id_b,'.csv.gz'))[, commodity:='pasture']
    setnames(df, old = c('value'),  new = c('price'))
    df_pp = df[establishment_type %in% c('cattle establishments') & value_type %in% c('breeders_p','calves_p'), list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per head']
    df_prices_b = rbind(df_pc, df_pp)
    
    #Payoffs per ha
    df_x = gather(df_land_b, landuse, acreage, pasture:sugarcane, factor_key=TRUE)
    df_y = merge(df_prices_b, df_yield_b, by=c('region', 'state_id', 'spatial_id', 'commodity'))
    setnames(df_y, old = c('commodity'),  new = c('landuse'))
    
    df_ols_b = merge(df_y, df_x, by=c('year','region', 'state_id','spatial_id', 'landuse'))[,list(year, region, state_id, spatial_id, landuse, acreage, price, yield)]
    df_ols_b$country='BRAZIL'
    
  }
}
df_ols = rbind(df_ols_a,df_ols_b)

###### Factors

df_factors = setDT(read_excel('../../data/factors/raw/factor_shares.xlsx'))[,c('commodity','land_share')]
setnames(df_factors, old='commodity', new='landuse')
df_ols = merge(df_ols, df_factors, by='landuse', all.x=T)
df_ols = df_ols[, c('payoff_per_ha'):=list( (price^(1/land_share))*yield )]
df_ols = df_ols[, c("year","region","state_id","spatial_id","landuse","acreage","price","yield","payoff_per_ha","country")]

###### Relative payoffs

n=0
for (landuse_item in c('maize','sunflower','wheat','rice','sugarcane','pasture')){
    
  df_aux = merge(df_ols[landuse==landuse_item,], df_ols[landuse=='soybean',], by=c('year','country','region','state_id','spatial_id'))
  df_aux = df_aux[,c('relative_acreage','relative_payoff') := list(acreage.y/acreage.x,payoff_per_ha.y/payoff_per_ha.x) ]
  
  if(n==0){df = df_aux}else{df = rbind(df,df_aux)}
  n=n+1
}
setnames(df, old=c('landuse.x'), new=c('landuse'))
df_ols_relative = df[, list(year, country, region, state_id, spatial_id, landuse, relative_acreage, relative_payoff)]

###### Exposure shares
id_type_a = c('county_known')
id_type_b = c('county_known')
year_max = 2018
for (country in c('argentina','brazil') ){
  
  if (country=='argentina'){
    
    df = fread(paste0('../../data/agribusiness/clean/soybean_',country,'.csv.gz'))[id_type %in% id_type_a & state %!in% c('UNKNOWN','IMPORTS + STOCK') & destination_bloc != 'UNKNOWN',]
    setnames(df, old = c(spatial_id_a),  new = c('spatial_id'))
    dfx = df[, list(year, country, region, state_id, spatial_id,destination_bloc, equivalent_tonnes, commodity)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, destination_bloc, commodity)]
    dfy = df[, list(year, country, region, state_id, spatial_id,equivalent_tonnes, commodity)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, commodity)]
    dfxy = merge(dfx,dfy, by=c('year','country','region','state_id','spatial_id','commodity'))[, c('exposure_share') := list(equivalent_tonnes.x/equivalent_tonnes.y)][, list(year, country, region, state_id, spatial_id, destination_bloc, exposure_share, commodity)]
    df_a = dfxy[year<=year_max,][, c("year") := NULL][, lapply(.SD, mean, na.rm=TRUE),  by=list(country, region, state_id, spatial_id, destination_bloc, commodity)]
    
    }else{
      
      n_c=1
      for (commodity in c('soybean','maize','beef')) {
        
        df = fread(paste0('../../data/agribusiness/clean/',commodity,'_',country,'.csv.gz'))[id_type %in% id_type_b & state !='UNKNOWN STATE' & destination != 'BRAZIL' & destination_bloc != 'UNKNOWN',]
        setnames(df, old = c(spatial_id_b),  new = c('spatial_id'))
        dfx = df[, list(year, country, region, state_id, spatial_id,destination_bloc,equivalent_tonnes, commodity)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, destination_bloc, commodity)]
        dfy = df[, list(year, country, region, state_id, spatial_id,equivalent_tonnes, commodity)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, commodity)]
        dfxy = merge(dfx,dfy, by=c('year','country','region','state_id','spatial_id','commodity'))[, c('exposure_share') := list(equivalent_tonnes.x/equivalent_tonnes.y)][, list(year, country, region, state_id, spatial_id, destination_bloc, exposure_share, commodity)]
        df_b_c = dfxy[year<=year_max,][, c("year") := NULL][, lapply(.SD, mean, na.rm=TRUE),  by=list(country, region, state_id, spatial_id, destination_bloc,commodity)]
        
        if(n_c==1){df_b = df_b_c}else{df_b = rbind(df_b,df_b_c)}
        
        n_c=n_c+1
      } 
    }
}

df_iv_shares = rbind(df_a,df_b)
df_iv_shares$landuse = df_iv_shares$commodity
df_iv_shares[commodity=='beef',]$landuse='pasture'

###### Temporal variation

df = fread('../../data/trade/clean/imports_from_world_iv.csv.gz')
df$landuse = df$item
df[landuse=='MAIZE']$landuse = 'maize'
df[landuse=='OIL, SUNFLOWER']$landuse = 'sunflower'
df[landuse=='RICE, MILLED']$landuse = 'rice'
df[landuse=='SOYBEANS']$landuse = 'soybean'
df[landuse=='WHEAT']$landuse = 'wheat'
df[landuse=='SUGAR RAW CENTRIFUGAL']$landuse = 'sugarcane'
df[landuse=='MEAT, CATTLE, BONELESS (BEEF & VEAL)']$landuse = 'pasture'
df[unit=='TONNES']$unit = 'tonnes'
df[unit=='USD']$unit = 'usd'
df_c = fread('../../data/trade/raw/destinations_trase_faostat_crosswalk.csv')[, list(destination_bloc, destination)]
df = merge(df_c, df, by=c('destination'))
destination_list = data.frame(unique(df$destination))
product_type_landuse=c('maize','pasture','sunflower','rice','soybean','sugarcane','wheat')
df_iv_t = df[landuse %in% product_type_landuse, list(year, destination_bloc, landuse, unit, residual_imports_from_world)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, destination_bloc, landuse, unit)]

df = df_iv_t
df$year_agg = 0
df[year %in% c(2000,2001,2002,2003,2004)]$year_agg = 2002
df[year %in% c(2006,2007,2008,2009,2010)]$year_agg = 2008
df[year %in% c(1993,1994,1995,1996,1997)]$year_agg = 1995
df[year %in% c(2003,2004,2006,2007,2008)]$year_agg = 2006
df[year %in% c(2015,2016,2017,2018,2019)]$year_agg = 2017
setnames(df, old=c('year','year_agg'), new=c('year_agg','year'))
df[, year_agg:=NULL]
df_iv_t_agg =  df[year %in% c(1995,2002,2008,2006,2017), lapply(.SD, mean, na.rm=TRUE), by=list(year, destination_bloc, landuse, unit)]
df_iv_t_agg2 = df_iv_t_agg[year==2017,]
df_iv_t_agg2$year = 2018
df_iv_t_agg = rbind(df_iv_t_agg,df_iv_t_agg2)

###### Interaction

df_x = df_iv_t_agg[landuse %!in% c('soybean','maize','pasture')]
df_y = df_iv_t_agg[landuse=='soybean',]
df = merge(df_x,df_y[, landuse:=NULL], by=c('year','destination_bloc','unit'))[,  c('relative_demand_shift') := list(residual_imports_from_world.y/residual_imports_from_world.x)][, list(year, destination_bloc, landuse, unit, relative_demand_shift)]
df = dcast(df, year + destination_bloc + landuse ~ unit, value.var = "relative_demand_shift")
setnames(df, old=c('usd'), new=c('relative_demand_shift'))
df_iv_t_wide = df
df = merge(df_iv_shares[landuse=='soybean', list(country, spatial_id, destination_bloc,exposure_share)], df_iv_t_wide, by=c('destination_bloc'), allow.cartesian=TRUE)
df[, c('relative_shock') := list(exposure_share*relative_demand_shift)]
df_iv_relative1 = df[, list(year, country, spatial_id, landuse, relative_shock)][, lapply(.SD, sum, na.rm=TRUE), by=list(year,country, spatial_id, landuse)]

df = dcast(df_iv_t_agg,  year + destination_bloc + landuse ~ unit, value.var = "residual_imports_from_world")
setnames(df, old=c('usd'), new=c('demand_shift'))
df_iv_t_wide = df
df = merge(df_iv_shares[, list(country, spatial_id, destination_bloc, exposure_share, landuse)], df_iv_t_wide, by=c('landuse','destination_bloc'), allow.cartesian=TRUE)
df[, c('demand') := list(exposure_share*demand_shift)]
df_iv_level = df[, list(year, country, spatial_id, landuse, demand)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, spatial_id, landuse)]
df_x = df_iv_level[landuse %!in% c('soybean'),]
df_y = df_iv_level[landuse=='soybean',]
df_iv_relative2 = merge(df_x,df_y[, landuse:=NULL], by=c('year','country','spatial_id'))[,  c('relative_shock') := list(demand.y/demand.x )][, list(year, country, spatial_id, landuse, relative_shock)]

df_iv_relative = rbind(df_iv_relative1, df_iv_relative2)

###### Final dataset

df_within = merge(df_ols_relative, df_iv_relative,  by=c('year','country','spatial_id','landuse'), all.x=T)
df_final = df_within[,c("year","country","spatial_id","landuse","region","state_id","relative_acreage","relative_payoff","relative_shock")]
write.csv(df_final, gzfile("inputs/data_supply_within.csv.gz"), row.names = FALSE)



  
